full_data <- readRDS('data/full_data.rds')

Monthly Production Data

prod_by_month <- full_data %>% 
    select(yearmonth, active_1km, monthly_oil_1km, monthly_gas_1km, 
           monthly_water_1km, monthly_boe_1km, active_2p5km, monthly_oil_2p5km, 
           monthly_gas_2p5km, monthly_water_2p5km, monthly_boe_2p5km, active_5km, 
           monthly_oil_5km, monthly_gas_5km, monthly_water_5km, monthly_boe_5km) %>%
  distinct() %>%
  filter(yearmonth <= '2023-03')

coef <- median(1032*prod_by_month$monthly_oil_1km/10^9)/median(prod_by_month$monthly_gas_1km/10^6)

full_data %>%
  ggplot(aes(x = yearmonth, group = 1)) + 
  geom_line(aes(y = monthly_oil_1km/10^6, color = 'Oil'), size=1) + 
  geom_line(aes(y = 1032*monthly_gas_1km/10^9/coef, color = 'Natural Gas'), size=1) +
  scale_y_continuous(
    name = "Monthly Oil in Million Barrels",
    sec.axis = sec_axis(~.*coef, name="Monthly Gas in Billion Cubic Feet")
  ) +
  theme(
    axis.title.y = element_text(color = 'tomato'),
    axis.title.y.right = element_text(color = 'skyblue')
  ) +
  scale_color_manual(values = c("skyblue", "tomato")) +
  scale_x_discrete(breaks = prod_by_month$yearmonth[seq(1, length(prod_by_month$yearmonth), by = 7)]) +
  xlab("Date") + labs(colour="Production Type") +
    theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 63301 rows containing missing values (`geom_line()`).
## Removed 63301 rows containing missing values (`geom_line()`).

Visualizing Daily Max H2S

H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 123 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1724: `day = 2020-08-22`, `Monitor = "Judson"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 122 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
  scale_y_continuous(limits=c(0, 50)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')

The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.

Visualizing Daily Average H2S

# Compute daily average
H2S_da <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_da_graph <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor", y = 'Daily Average H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_da_graph) %>% layout(dragmode = 'pan')
H2S_da_graph_b <- H2S_da %>%
 ggplot(aes(x=day, y=H2S_daily_avg, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Average H2S concentration by monitor w/o outlier", y = 'Daily Avearge H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
full_data %>%
  polarPlot(pollutant = "H2S", col = "turbo", 
            key.position = "bottom",
            key.header = "mean H2S", 
            key.footer = NULL, title = 'hi')

# Create a polar map
# TBD: include markers for refineries
polarplot <- polarMap(
  full_data %>% 
    filter(!(is.na(latitude.x) | is.na(H2S) | is.na(wd) | is.na(ws))) %>%
    rename(date = DateTime),
  pollutant = "H2S",
  latitude = "latitude.x",
  longitude = "longitude.x",
  popup = "Monitor",
  provider = "Esri.WorldImagery",
  d.icon = 150,
  d.fig = 2.5,
  alpha = 0.75
)
## 
Creating Polar Markers â– â– â– â– â– â–  15% | ETA: 9s

Creating Polar Markers â– â– â– â– â– â– â– â–  23%
## | ETA: 8s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â–  31% | ETA: 7s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â–  38% | ETA: 6s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  46% | ETA:
## 5s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  54% | ETA: 4s

Creating Polar Markers
## â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  62% | ETA: 4s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– 
## 69% | ETA: 3s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  77% | ETA:
## 2s

Creating Polar Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  85% | ETA: 2s

Creating Polar
## Markers â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–  92% | ETA: 1s

polarplot
#saveWidget(polarplot, file="polarplot.html")
# Create variable to indicate downwind wrt refinery
wind_diff <- abs(full_data$Converted_Angle - 180 - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
full_data$downwind_ref <- as.integer(wind_diff <= 15)

Find angle between monitor and nearest water treatment plant

# Function to calculate the angle between two lat/long points
calculate_angle <- function(lat1, lon1, lat2, lon2) {

  # Calculate the bearing between the two points
  angle <- bearing(cbind(lon1, lat1), cbind(lon2, lat2))
  
  # Return the angle
  return(angle)
}

wrp <- full_data %>% 
  select(name, wrp_utm_x, wrp_utm_y) %>%
  distinct()

wrp_coord <- cbind(wrp, terra::project(as.matrix(cbind(wrp$wrp_utm_x, wrp$wrp_utm_y)), 
                            "+proj=utm +zone=11 ellps=WGS84", 
                            "+proj=longlat +datum=WGS84")) %>%
  rename(longitude.wrp = '1',
         latitude.wrp = '2')

full_data <- full_data %>%
  left_join(wrp_coord %>% select(name, latitude.wrp, longitude.wrp), join_by(name), keep=F)

mon_wrp <- full_data %>% 
              select(Monitor, name, latitude.x, longitude.x, latitude.wrp, longitude.wrp) %>%
              distinct()

mon_wrp$wrp_angle <- calculate_angle(mon_wrp$latitude.x, mon_wrp$longitude.x, 
                                     mon_wrp$latitude.wrp, mon_wrp$longitude.wrp)

mon_wrp$wrp_angle <- if_else(mon_wrp$wrp_angle < 0, mon_wrp$wrp_angle + 360, mon_wrp$wrp_angle)

full_data <- full_data %>%
  left_join(mon_wrp %>% select(Monitor, name, wrp_angle), join_by(Monitor, name), keep=F)

# Finally, check if wind direction is downwind w.r.t wrp
wind_diff <- abs(full_data$wrp_angle - full_data$wd)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

full_data$downwind_wrp <- as.integer(wind_diff <= 15)
# Compute daily average wd/ws
daily_full <- full_data %>%
  select(H2S, ws, wd, latitude.x, longitude.x, mon_utm_x, mon_utm_y, Monitor, MinDist, 
         Converted_Angle, Refinery, year, month, day, weekday, 
         monthly_oil_1km, monthly_gas_1km, monthly_oil_2p5km, 
         monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km,
         dist_wrp, capacity, active_1km, active_2p5km, active_5km,
         wrp_angle) %>%
  group_by(Monitor, day) %>%
  mutate(H2S_daily_max = max(H2S, na.rm=TRUE),
            H2S_daily_avg = mean(H2S, na.rm=TRUE),
            ws_avg = mean(ws, na.rm=TRUE),
            wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
  mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
  ungroup() %>%
  rename(monitor_lat = latitude.x, monitor_lon = longitude.x) %>%
  mutate(H2S_daily_max = ifelse(H2S_daily_max == -Inf, NA, H2S_daily_max)) %>%
  select(-c(H2S, wd, ws)) %>%
  distinct()
## Warning: There were 2449 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1637: `Monitor = "First Methodist"`, `day = 2022-11-23`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 2448 remaining warnings.
# Get the downwind indicators for daily data
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_ref <- wind_diff

wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)

daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)

Get monitor locations

monitors_coord <- full_data %>%
  select(Monitor, latitude.x, longitude.x) %>%
  distinct()

monitors <- monitors_coord %>%
  select(Monitor)
  
coordinates(monitors_coord) <- ~ longitude.x + latitude.x

Load in elevation data

elevation <- raster('shapefiles/N33W119.hgt')

monitors <- cbind(monitors, extract(elevation, monitors_coord)) %>%
  as.data.frame() %>%
  rename(elevation = 2)

Load in EVI data

evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')

monitors <- cbind(monitors, extract(evi, monitors_coord) * 0.0001) %>%
  as.data.frame() %>%
  rename(EVI = 3)

Merge EVI and elevation to data

full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))

daily_full <- daily_full %>%
  left_join(monitors, join_by(Monitor))

Explore some GAM models

Five minute H2S

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.172e+00  3.213e-03  364.72   <2e-16 ***
## year2023       -3.263e-01  2.800e-03 -116.54   <2e-16 ***
## wd_secQ2       -2.560e-01  3.751e-03  -68.23   <2e-16 ***
## wd_secQ3       -2.898e-01  3.892e-03  -74.46   <2e-16 ***
## wd_secQ4       -2.095e-01  3.492e-03  -59.99   <2e-16 ***
## ws             -5.424e-02  4.176e-04 -129.91   <2e-16 ***
## I(1/MinDist^2)  2.114e+05  2.329e+03   90.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.996      8 2496  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0946   Deviance explained = 9.47%
## GCV = 0.92965  Scale est. = 0.92963   n = 772251
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   s(latitude.x, longitude.x, bs='tp', k = 8), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     s(latitude.x, longitude.x, bs = "tp", k = 8)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.012e+00  3.171e-03  319.16   <2e-16 ***
## wd_secQ2       -1.954e-01  3.696e-03  -52.87   <2e-16 ***
## wd_secQ3       -1.822e-01  3.905e-03  -46.65   <2e-16 ***
## wd_secQ4       -1.129e-01  3.487e-03  -32.39   <2e-16 ***
## ws             -6.921e-02  4.147e-04 -166.90   <2e-16 ***
## I(1/MinDist^2)  3.002e+05  2.858e+03  105.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df    F p-value    
## s(as.numeric(month))      7.999      8 1708  <2e-16 ***
## s(latitude.x,longitude.x) 6.999      7 6523  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.13   Deviance explained =   13%
## GCV = 0.8932  Scale est. = 0.89317   n = 772251
plot(h2s_model_b)

# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws + 
                     I(1/MinDist^2) + Converted_Angle + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     as.factor(weekday), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Converted_Angle + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10) + as.factor(weekday)
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           2.420e+00  2.374e-02  101.947  < 2e-16 ***
## year2023             -3.327e-01  2.864e-03 -116.135  < 2e-16 ***
## wd_secQ2             -1.858e-01  3.643e-03  -51.011  < 2e-16 ***
## wd_secQ3             -1.897e-01  3.846e-03  -49.325  < 2e-16 ***
## wd_secQ4             -1.090e-01  3.441e-03  -31.663  < 2e-16 ***
## ws                   -6.128e-02  4.076e-04 -150.319  < 2e-16 ***
## I(1/MinDist^2)        1.112e-04  1.961e-06   56.689  < 2e-16 ***
## Converted_Angle      -5.775e-03  1.129e-04  -51.138  < 2e-16 ***
## as.factor(weekday).L  9.867e-02  2.801e-03   35.228  < 2e-16 ***
## as.factor(weekday).Q -1.699e-01  2.808e-03  -60.505  < 2e-16 ***
## as.factor(weekday).C -3.883e-03  2.803e-03   -1.385    0.166    
## as.factor(weekday)^4 -2.054e-02  2.814e-03   -7.299  2.9e-13 ***
## as.factor(weekday)^5 -5.798e-02  2.807e-03  -20.654  < 2e-16 ***
## as.factor(weekday)^6 -2.507e-02  2.820e-03   -8.891  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df    F p-value    
## s(as.numeric(month))      7.997      8 2708  <2e-16 ***
## s(latitude.x,longitude.x) 8.999      9 6588  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.156   Deviance explained = 15.6%
## GCV = 0.86621  Scale est. = 0.86618   n = 772251
plot(h2s_model_c)

# Include converted angle and weekday
h2s_model_d <- gam(H2S ~ 
                     s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                     wd_sec + ws + downwind_ref + downwind_wrp +
                     I(1/MinDist^2) + dist_wrp + capacity +
                     Converted_Angle + 
                     s(latitude.x, longitude.x, bs='tp', k = 10) + 
                     monthly_oil_1km + monthly_gas_1km + active_1km
                     , 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) + 
##     dist_wrp + capacity + Converted_Angle + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          -2.757e+00  5.816e-02  -47.406  < 2e-16 ***
## year2023             -1.356e-01  6.865e-03  -19.751  < 2e-16 ***
## as.factor(weekday).L  1.082e-01  2.989e-03   36.201  < 2e-16 ***
## as.factor(weekday).Q -1.838e-01  2.995e-03  -61.371  < 2e-16 ***
## as.factor(weekday).C -8.157e-04  2.981e-03   -0.274  0.78432    
## as.factor(weekday)^4 -1.475e-02  2.983e-03   -4.945 7.61e-07 ***
## as.factor(weekday)^5 -5.943e-02  2.975e-03  -19.979  < 2e-16 ***
## as.factor(weekday)^6 -2.875e-02  2.985e-03   -9.631  < 2e-16 ***
## wd_secQ2             -1.520e-01  3.935e-03  -38.615  < 2e-16 ***
## wd_secQ3             -1.310e-01  4.161e-03  -31.492  < 2e-16 ***
## wd_secQ4             -7.165e-02  3.677e-03  -19.487  < 2e-16 ***
## ws                   -7.086e-02  4.333e-04 -163.531  < 2e-16 ***
## downwind_ref          1.444e-01  4.136e-03   34.917  < 2e-16 ***
## downwind_wrp         -1.370e-02  4.575e-03   -2.993  0.00276 ** 
## I(1/MinDist^2)        4.732e-05  9.908e-07   47.760  < 2e-16 ***
## dist_wrp              5.576e-04  6.067e-06   91.915  < 2e-16 ***
## capacity              4.449e-03  4.555e-05   97.666  < 2e-16 ***
## Converted_Angle      -2.954e-03  1.232e-04  -23.966  < 2e-16 ***
## monthly_oil_1km       5.172e-05  2.488e-06   20.787  < 2e-16 ***
## monthly_gas_1km       2.546e-04  1.226e-05   20.767  < 2e-16 ***
## active_1km           -2.863e-02  1.076e-03  -26.604  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df      F p-value    
## s(as.numeric(month))      7.998  8.000  988.2  <2e-16 ***
## s(latitude.x,longitude.x) 8.003  8.004 5198.0  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 36/38
## R-sq.(adj) =  0.162   Deviance explained = 16.2%
## GCV = 0.90017  Scale est. = 0.90012   n = 711593
plot(h2s_model_d)

Daily max H2S

# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
                        I(1/MinDist^2), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg + 
##     ws_avg + I(1/MinDist^2)
## 
## Parametric coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.062e+00  3.468e-01   8.829   <2e-16 ***
## year2023       -4.367e-01  2.197e-01  -1.988   0.0469 *  
## wd_avg          5.555e-04  1.031e-03   0.539   0.5903    
## ws_avg         -8.938e-02  6.342e-02  -1.409   0.1588    
## I(1/MinDist^2)  7.943e-07  8.452e-08   9.399   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(as.numeric(month)) 2.171      8 2.002   7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 12/13
## R-sq.(adj) =  0.00868   Deviance explained = 1.06%
## GCV = 21.458  Scale est. = 21.408    n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp", 
##     k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       9.085e+00  1.486e+01   0.612  0.54088    
## wd_avg                            2.922e-03  1.008e-03   2.900  0.00376 ** 
## ws_avg                           -3.126e-01  6.339e-02  -4.931  8.7e-07 ***
## I(1/MinDist^2)                   -1.612e-05  1.834e-03  -0.009  0.99299    
## RefineryMarathon (Carson)        -2.861e-01  1.840e+01  -0.016  0.98759    
## RefineryMarathon (Wilmington)    -3.058e+00  1.820e+01  -0.168  0.86659    
## RefineryPhillips 66 (Wilmington) -1.378e+01  1.702e+01  -0.809  0.41840    
## RefineryTorrance Refinery        -2.545e+00  1.470e+01  -0.173  0.86261    
## RefineryValero Refinery          -6.947e+00  1.842e+01  -0.377  0.70604    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(as.numeric(month))       2.026  8.000 1.698 0.000225 ***
## s(monitor_lat,monitor_lon) 5.408  5.699 9.292 6.79e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 25/26
## R-sq.(adj) =  0.127   Deviance explained = 13.1%
## GCV = 18.972  Scale est. = 18.862    n = 2664
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Converted_Angle+
                        s(monitor_lat, monitor_lon, bs='tp', k=10), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.630e+00  8.823e-01   4.114    4e-05 ***
## wd_avg           2.807e-03  1.012e-03   2.775  0.00555 ** 
## ws_avg          -2.243e-01  6.210e-02  -3.612  0.00031 ***
## I(1/MinDist^2)  -8.554e-06  3.810e-05  -0.225  0.82237    
## Converted_Angle -3.393e-03  3.891e-03  -0.872  0.38337    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F  p-value    
## s(as.numeric(month))       2.103  8.000  2.015 5.44e-05 ***
## s(monitor_lat,monitor_lon) 7.070  7.918 41.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/22
## R-sq.(adj) =  0.116   Deviance explained = 12.1%
## GCV = 19.175  Scale est. = 19.08     n = 2664
plot(h2s_dm_model_c)

h2s_dm_model_d <- gam(H2S_daily_max ~ 
                         s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + dist_wrp + capacity +
                         Converted_Angle + 
                         s(monitor_lat, monitor_lon, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km
                      , 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp + 
##     capacity + Converted_Angle + s(monitor_lat, monitor_lon, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.823e+00  4.562e+00  -1.934 0.053195 .  
## year2023             -3.810e-01  4.427e-01  -0.861 0.389493    
## as.factor(weekday).L  4.554e-01  2.416e-01   1.885 0.059594 .  
## as.factor(weekday).Q -9.103e-01  2.420e-01  -3.761 0.000173 ***
## as.factor(weekday).C -5.222e-03  2.402e-01  -0.022 0.982657    
## as.factor(weekday)^4 -7.991e-02  2.399e-01  -0.333 0.739104    
## as.factor(weekday)^5 -3.462e-01  2.390e-01  -1.449 0.147562    
## as.factor(weekday)^6 -1.746e-01  2.393e-01  -0.730 0.465696    
## wd_avg                3.345e-03  1.084e-03   3.087 0.002047 ** 
## ws_avg               -3.166e-01  7.027e-02  -4.505 6.94e-06 ***
## daily_downwind_ref   -7.402e-03  2.082e-03  -3.555 0.000386 ***
## I(1/MinDist^2)        8.678e-05  3.693e-04   0.235 0.814227    
## dist_wrp              1.856e-03  6.652e-04   2.791 0.005301 ** 
## capacity              7.034e-03  1.116e-02   0.630 0.528477    
## Converted_Angle       7.174e-03  9.307e-03   0.771 0.440909    
## monthly_oil_1km      -1.698e-05  1.291e-04  -0.132 0.895365    
## monthly_gas_1km      -2.722e-04  5.914e-04  -0.460 0.645368    
## active_1km            1.892e-03  4.268e-02   0.044 0.964638    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F p-value    
## s(as.numeric(month))       1.722  8.000  0.963 0.00486 ** 
## s(monitor_lat,monitor_lon) 7.733  7.963 13.249 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =   0.14   Deviance explained = 14.9%
## GCV = 20.119  Scale est. = 19.9      n = 2428
plot(h2s_dm_model_d)

# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))

summary(h2s_dm_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.825e+00  1.777e+00   3.840 0.000126 ***
## year2023                 -3.738e-01  4.414e-01  -0.847 0.397189    
## as.character(weekday)Mon -6.566e-01  3.370e-01  -1.948 0.051492 .  
## as.character(weekday)Sat -7.142e-01  3.433e-01  -2.080 0.037592 *  
## as.character(weekday)Sun -1.154e+00  3.411e-01  -3.384 0.000727 ***
## as.character(weekday)Thu -3.028e-01  3.414e-01  -0.887 0.375110    
## as.character(weekday)Tue -1.035e-01  3.361e-01  -0.308 0.758036    
## as.character(weekday)Wed  6.118e-02  3.404e-01   0.180 0.857392    
## wd_avg                    3.315e-03  1.085e-03   3.055 0.002275 ** 
## ws_avg                   -3.129e-01  6.992e-02  -4.475 8.01e-06 ***
## daily_downwind_ref       -6.873e-03  2.071e-03  -3.318 0.000919 ***
## I(1/dist_wrp^2)          -1.718e-06  2.759e-07  -6.226 5.64e-10 ***
## monthly_oil_1km          -2.111e-05  1.269e-04  -0.166 0.867886    
## monthly_gas_1km          -2.160e-04  5.920e-04  -0.365 0.715256    
## active_1km                6.188e-03  4.147e-02   0.149 0.881386    
## daily_downwind_wrp        2.502e-02  4.310e-01   0.058 0.953715    
## elevation                -9.530e-02  6.145e-02  -1.551 0.121047    
## EVI                      -2.977e+00  7.777e-01  -3.828 0.000133 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                           edf Ref.df      F
## s(as.numeric(month))                                    1.201  8.000  0.508
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                  7.867  8.599 14.844
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 2.434 77.000  0.062
##                                                         p-value    
## s(as.numeric(month))                                     0.0115 *  
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  0.0289 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/112
## R-sq.(adj) =  0.141   Deviance explained = 15.1%
## GCV = 20.111  Scale est. = 19.875    n = 2428
plot(h2s_dm_model_e)

e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(e, 3), fix = c(`as.numeric(day)` = 0), residuals = F)

# try a plot using wireframe
plot3d <- matrix(fitted(h2s_dm_model_e), ncol = 20)
## Warning in matrix(fitted(h2s_dm_model_e), ncol = 20): data length [2428] is not
## a sub-multiple or multiple of the number of rows [122]
wireframe(plot3d, drape=TRUE, colorkey=TRUE)

Daily Average

# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(monitor_lon, monitor_lat, bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(monitor_lon, monitor_lat, 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.834e+00  6.193e-01  -6.190 7.07e-10 ***
## year2023             -1.164e-01  5.891e-02  -1.976  0.04829 *  
## as.factor(weekday).L  7.609e-02  2.493e-02   3.052  0.00230 ** 
## as.factor(weekday).Q -1.646e-01  2.494e-02  -6.601 5.02e-11 ***
## as.factor(weekday).C  2.708e-02  2.475e-02   1.094  0.27399    
## as.factor(weekday)^4  1.281e-02  2.472e-02   0.518  0.60426    
## as.factor(weekday)^5 -4.826e-02  2.462e-02  -1.960  0.05008 .  
## as.factor(weekday)^6 -4.529e-02  2.465e-02  -1.837  0.06626 .  
## wd_avg                6.843e-04  1.121e-04   6.102 1.22e-09 ***
## ws_avg               -1.076e-01  7.473e-03 -14.400  < 2e-16 ***
## daily_downwind_ref   -2.637e-03  2.149e-04 -12.270  < 2e-16 ***
## I(1/MinDist^2)        2.522e-04  3.082e-05   8.184 4.41e-16 ***
## I(1/dist_wrp^2)      -1.355e-05  2.262e-06  -5.992 2.38e-09 ***
## capacity              1.374e-02  1.201e-03  11.434  < 2e-16 ***
## Converted_Angle      -2.711e-03  1.026e-03  -2.643  0.00828 ** 
## monthly_oil_1km       4.684e-05  1.939e-05   2.416  0.01576 *  
## monthly_gas_1km       1.977e-04  9.844e-05   2.008  0.04478 *  
## active_1km           -2.225e-02  8.342e-03  -2.667  0.00771 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df      F p-value    
## s(as.numeric(month))       7.412      8  9.635  <2e-16 ***
## s(monitor_lon,monitor_lat) 8.981      9 74.675  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =   0.42   Deviance explained = 42.8%
## GCV = 0.21367  Scale est. = 0.21082   n = 2428
plot(h2s_da_model_a)

a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                         I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
                         Converted_Angle + 
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) + 
##     capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), 
##     bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km + 
##     active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.530e+00  5.179e-01  -4.886 1.09e-06 ***
## year2023             -1.165e-01  5.891e-02  -1.977  0.04812 *  
## as.factor(weekday).L  7.609e-02  2.493e-02   3.052  0.00230 ** 
## as.factor(weekday).Q -1.646e-01  2.494e-02  -6.600 5.04e-11 ***
## as.factor(weekday).C  2.708e-02  2.475e-02   1.094  0.27394    
## as.factor(weekday)^4  1.280e-02  2.472e-02   0.518  0.60447    
## as.factor(weekday)^5 -4.824e-02  2.462e-02  -1.959  0.05017 .  
## as.factor(weekday)^6 -4.527e-02  2.465e-02  -1.837  0.06637 .  
## wd_avg                6.842e-04  1.121e-04   6.101 1.22e-09 ***
## ws_avg               -1.075e-01  7.472e-03 -14.392  < 2e-16 ***
## daily_downwind_ref   -2.639e-03  2.149e-04 -12.280  < 2e-16 ***
## I(1/MinDist^2)        1.111e-04  1.515e-05   7.332 3.09e-13 ***
## I(1/dist_wrp^2)      -8.496e-06  1.082e-06  -7.849 6.26e-15 ***
## capacity              1.047e-02  8.976e-04  11.663  < 2e-16 ***
## Converted_Angle      -2.584e-03  1.007e-03  -2.567  0.01031 *  
## monthly_oil_1km       4.685e-05  1.939e-05   2.417  0.01574 *  
## monthly_gas_1km       1.977e-04  9.844e-05   2.009  0.04469 *  
## active_1km           -2.225e-02  8.341e-03  -2.668  0.00768 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df      F p-value    
## s(as.numeric(month))                   7.412      8  9.638  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.970      9 75.189  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 33/35
## R-sq.(adj) =   0.42   Deviance explained = 42.8%
## GCV = 0.21367  Scale est. = 0.21082   n = 2428
plot(h2s_da_model_b)

b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.202e+00  2.347e-01   5.123 3.25e-07 ***
## year2023             -1.397e-01  6.092e-02  -2.293 0.021906 *  
## as.factor(weekday).L  7.543e-02  2.579e-02   2.925 0.003472 ** 
## as.factor(weekday).Q -1.608e-01  2.580e-02  -6.235 5.31e-10 ***
## as.factor(weekday).C  2.784e-02  2.560e-02   1.088 0.276842    
## as.factor(weekday)^4  1.114e-02  2.556e-02   0.436 0.662989    
## as.factor(weekday)^5 -4.261e-02  2.546e-02  -1.674 0.094323 .  
## as.factor(weekday)^6 -4.076e-02  2.549e-02  -1.599 0.109968    
## wd_avg                6.259e-04  1.159e-04   5.401 7.29e-08 ***
## ws_avg               -8.586e-02  7.483e-03 -11.474  < 2e-16 ***
## daily_downwind_ref   -3.152e-03  2.176e-04 -14.486  < 2e-16 ***
## I(1/dist_wrp^2)       2.212e-07  5.698e-08   3.882 0.000106 ***
## monthly_oil_1km       4.885e-05  2.006e-05   2.435 0.014979 *  
## monthly_gas_1km       2.139e-04  1.018e-04   2.100 0.035812 *  
## active_1km           -2.430e-02  8.632e-03  -2.815 0.004913 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df      F p-value    
## s(as.numeric(month))                   7.420  8.000  9.823  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.884  8.996 89.032  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 31/32
## R-sq.(adj) =   0.38   Deviance explained = 38.7%
## GCV = 0.2284  Scale est. = 0.22555   n = 2428
plot(h2s_da_model_c)

c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                          
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp + 
##     elevation + EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.634e+00  2.551e-01   6.405 1.81e-10 ***
## year2023                 -1.164e-01  5.893e-02  -1.976  0.04826 *  
## as.character(weekday)Mon -7.772e-02  3.479e-02  -2.234  0.02558 *  
## as.character(weekday)Sat -7.981e-02  3.538e-02  -2.256  0.02418 *  
## as.character(weekday)Sun -1.778e-01  3.519e-02  -5.053 4.69e-07 ***
## as.character(weekday)Thu -3.117e-02  3.521e-02  -0.885  0.37607    
## as.character(weekday)Tue  1.504e-02  3.468e-02   0.434  0.66468    
## as.character(weekday)Wed  6.735e-02  3.513e-02   1.917  0.05532 .  
## wd_avg                    6.833e-04  1.122e-04   6.092 1.30e-09 ***
## ws_avg                   -1.078e-01  7.481e-03 -14.412  < 2e-16 ***
## daily_downwind_ref       -2.633e-03  2.151e-04 -12.242  < 2e-16 ***
## I(1/dist_wrp^2)          -1.815e-07  3.608e-08  -5.032 5.21e-07 ***
## monthly_oil_1km           4.688e-05  1.939e-05   2.418  0.01568 *  
## monthly_gas_1km           1.990e-04  9.846e-05   2.021  0.04336 *  
## active_1km               -2.230e-02  8.343e-03  -2.673  0.00757 ** 
## daily_downwind_wrp        8.934e-03  4.445e-02   0.201  0.84072    
## elevation                -1.335e-02  7.426e-03  -1.798  0.07225 .  
## EVI                      -1.034e+00  8.072e-02 -12.806  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf Ref.df      F p-value    
## s(as.numeric(month))                   7.411  8.000  9.656  <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.890  8.996 49.188  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =   0.42   Deviance explained = 42.8%
## GCV = 0.21384  Scale est. = 0.2109    n = 2428
plot(h2s_da_model_d)

d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
                         wd_avg + ws_avg + daily_downwind_ref + 
                          I(1/dist_wrp^2) +
                         s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) + 
                        te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
                         monthly_oil_1km + monthly_gas_1km + active_1km + 
                        daily_downwind_wrp + elevation + EVI, 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) + 
##     wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) + 
##     s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) + 
##     te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), 
##         k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km + 
##     monthly_gas_1km + active_1km + daily_downwind_wrp + elevation + 
##     EVI
## 
## Parametric coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.008e+00  5.098e-01   3.939 8.44e-05 ***
## year2023                  1.847e-01  1.435e-01   1.287  0.19806    
## as.character(weekday)Mon -8.023e-02  2.857e-02  -2.808  0.00503 ** 
## as.character(weekday)Sat -8.077e-02  2.902e-02  -2.783  0.00543 ** 
## as.character(weekday)Sun -1.828e-01  2.890e-02  -6.325 3.04e-10 ***
## as.character(weekday)Thu -3.405e-02  2.892e-02  -1.177  0.23919    
## as.character(weekday)Tue  1.226e-02  2.853e-02   0.430  0.66740    
## as.character(weekday)Wed  5.745e-02  2.884e-02   1.992  0.04646 *  
## wd_avg                    5.567e-04  9.687e-05   5.747 1.03e-08 ***
## ws_avg                   -1.028e-01  6.380e-03 -16.105  < 2e-16 ***
## daily_downwind_ref       -2.050e-03  1.944e-04 -10.546  < 2e-16 ***
## I(1/dist_wrp^2)           7.189e-08  1.175e-07   0.612  0.54081    
## monthly_oil_1km           5.153e-05  3.252e-05   1.585  0.11320    
## monthly_gas_1km           5.219e-05  1.949e-04   0.268  0.78888    
## active_1km               -9.009e-03  1.252e-02  -0.720  0.47179    
## daily_downwind_wrp        2.112e-02  3.715e-02   0.569  0.56968    
## elevation                -6.188e-02  1.024e-02  -6.045 1.73e-09 ***
## EVI                      -1.445e+00  1.269e-01 -11.394  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                                            edf Ref.df      F
## s(as.numeric(month))                                     7.506      8  3.636
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   9.000      9 22.836
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.687     77 16.140
##                                                          p-value    
## s(as.numeric(month))                                    0.000156 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3))                   < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day))  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 111/112
## R-sq.(adj) =  0.611   Deviance explained = 62.9%
## GCV = 0.14809  Scale est. = 0.14143   n = 2428
plot(h2s_da_model_e)

d <- getViz(h2s_da_model_e)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)

# try a plot using wireframe
plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
## Warning in matrix(fitted(h2s_da_model_e), ncol = 20): data length [2428] is not
## a sub-multiple or multiple of the number of rows [122]
wireframe(plot3d, drape=TRUE, colorkey=TRUE)

Monthly

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.299e+00  6.337e-01   3.628 0.000285 ***
## year2021                          7.513e+00  3.587e-01  20.948  < 2e-16 ***
## year2022                         -9.719e+00  3.990e-01 -24.359  < 2e-16 ***
## year2023                         -4.519e+00  5.031e-01  -8.983  < 2e-16 ***
## wd_secQ2                         -1.993e-01  3.750e-01  -0.531 0.595141    
## wd_secQ3                          3.380e+00  3.813e-01   8.865  < 2e-16 ***
## wd_secQ4                         -1.999e+00  3.604e-01  -5.546 2.93e-08 ***
## ws                                1.983e-01  4.050e-02   4.898 9.68e-07 ***
## I(1/MinDist^2)                    5.375e+05  3.251e+05   1.653 0.098241 .  
## RefineryMarathon (Carson)         4.189e+01  5.190e-01  80.710  < 2e-16 ***
## RefineryMarathon (Wilmington)     4.183e+00  6.036e-01   6.930 4.21e-12 ***
## RefineryPhillips 66 (Wilmington)  3.237e+00  5.332e-01   6.070 1.28e-09 ***
## RefineryTorrance Refinery        -3.331e+00  4.910e-01  -6.785 1.16e-11 ***
## RefineryValero Refinery           6.077e+00  5.903e-01  10.294  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.999      8 4076  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0271   Deviance explained = 2.71%
## GCV =  24024  Scale est. = 24024     n = 2056378
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude.x, longitude.x, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude.x, longitude.x, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       247.73043 1330.12474   0.186 0.852252    
## year2021                            4.46631    0.34810  12.830  < 2e-16 ***
## year2022                           -5.16925    0.39625 -13.045  < 2e-16 ***
## year2023                            7.01499    0.49759  14.098  < 2e-16 ***
## wd_secQ2                            0.75576    0.34603   2.184 0.028956 *  
## wd_secQ3                            0.04189    0.35248   0.119 0.905406    
## wd_secQ4                           -1.22491    0.33302  -3.678 0.000235 ***
## ws                                 -0.01374    0.03742  -0.367 0.713562    
## I(1/MinDist^2)                      1.80216    0.32232   5.591 2.26e-08 ***
## RefineryMarathon (Carson)        -768.08881 1478.49393  -0.520 0.603407    
## RefineryMarathon (Wilmington)      93.90759 1452.37996   0.065 0.948447    
## RefineryPhillips 66 (Wilmington)    0.48764 1513.24530   0.000 0.999743    
## RefineryTorrance Refinery        -241.61130 1400.50381  -0.173 0.863031    
## RefineryValero Refinery          -388.51087 1432.72236  -0.271 0.786261    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df     F p-value    
## s(as.numeric(month))      7.999      8  4850  <2e-16 ***
## s(latitude.x,longitude.x) 7.000      7 51324  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 30/31
## R-sq.(adj) =  0.172   Deviance explained = 17.2%
## GCV =  20447  Scale est. = 20446     n = 2056378
plot(h2s_ma_model_b)

Try XGBoost on Daily Average

# Cross validation by leaving one monitor out 
daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Monitor, ' ', '_'),
         weekday = as.character(weekday),
         daily_downwind_ref = as.integer(daily_downwind_ref),
         daily_downwind_wrp = as.integer(daily_downwind_wrp))

train <- daily_full %>%
  filter(Monitor != 'Harbor_Park')

train <- train[complete.cases(train),] %>%
  filter(day >= '2022-02-01')

train <- fastDummies::dummy_cols(train %>%
                   select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
                             monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
                             mon_utm_x, mon_utm_y)) %>%
                   mutate(MinDist = 1/(MinDist^2)),
                   remove_selected_columns = TRUE)

test <- daily_full %>%
  filter(Monitor == 'Harbor_Park')

test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                   ungroup() %>%
                   filter(day >= '2022-02-01') %>%
                   select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
                             monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
                             mon_utm_x, mon_utm_y)) %>%
                   mutate(MinDist = 1/(MinDist^2)),
                   remove_selected_columns = TRUE)

# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
                         max_depth = c(3, 4, 5),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))

# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", 
                        number=10,
                        verboseIter=TRUE, 
                        search='grid')

fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
# Get testing performance on Harbor Park monitor
predictions <- predict(fit.xgb_da$finalModel, 
                       newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
R2(test[complete.cases(test),] %>% pull(H2S_daily_avg), predictions)
## [1] 0.2309242
fit.xgb_da$finalModel
## ##### xgb.Booster
## raw: 488.6 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 37 
## niter: 200
## nfeatures : 37 
## xNames : monitor_lat monitor_lon MinDist Converted_Angle monthly_oil_1km monthly_gas_1km dist_wrp capacity active_1km wrp_angle ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 77     200         5 0.1 0.001              0.5                0      0.75
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top=10)

SHAP for XGBoost above (Caret)

matrix <- test %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))  
xgb.plot.shap(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 43
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced
## Warning in min(diff(x_uniq), na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Warning in stats::runif(length(x), -amount, amount): NAs produced

xgb.ggplot.shap.summary(data = matrix, 
              model = fit.xgb_da$finalModel, top_n = 10)

CV by leaving each monitor out once

post2022feb <- daily_full %>% 
  filter(day >= '2022-02-01')

monitor_names <- unique(post2022feb$Monitor)

# for (i in 1:length(monitor_names)) {
#   train <- post2022feb %>%
#     filter(Monitor != monitor_names[i])
#   
#   train <- train[complete.cases(train),]
# 
#   train <- fastDummies::dummy_cols(train %>%
#                      select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
#                                monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
#                                mon_utm_x, mon_utm_y)) %>%
#                      mutate(MinDist = 1/(MinDist^2)),
#                      remove_selected_columns = TRUE)
# 
#   fit.xgb_da <- train(H2S_daily_avg~.,
#                  method = 'xgbTree',
#                  data = train,
#                  trControl=control,
#                  tuneGrid = tune_grid,
#                  tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
#   
#   saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())

add_cols <- function(df, cols) {
  add <- cols[!cols %in% names(df)]
  if(length(add) !=0 ) df[add] <- NA
  return(df)
}

for (i in setdiff(1:length(monitor_names), c(9))) {

  fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
  
  test <- post2022feb %>%
    filter(Monitor == monitor_names[i])

  test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
                     ungroup() %>%
                     select(-c(Refinery, Monitor, day, active_2p5km, active_5km, monthly_oil_2p5km,
                           monthly_gas_2p5km, monthly_oil_5km, monthly_gas_5km, H2S_daily_max,
                           mon_utm_x, mon_utm_y)) %>%
                       add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
                                  'month_03', 'month_04', 'month_05', 'month_06', 
                                  'month_07', 'month_08', 'month_09', 'month_10',
                                  'month_11', 'month_12')) %>%
                     mutate(MinDist = 1/(MinDist^2),
                            year_2022 = ifelse(is.na(year_2022), 0, year_2022),
                            year_2023 = ifelse(is.na(year_2023), 0, year_2023),
                            month_01 = ifelse(is.na(month_01), 0, month_01),
                            month_02 = ifelse(is.na(month_02), 0, month_02),
                            month_03 = ifelse(is.na(month_03), 0, month_03),
                            month_04 = ifelse(is.na(month_04), 0, month_04),
                            month_05 = ifelse(is.na(month_05), 0, month_05),
                            month_06 = ifelse(is.na(month_06), 0, month_06),
                            month_07 = ifelse(is.na(month_07), 0, month_07),
                            month_08 = ifelse(is.na(month_08), 0, month_08),
                            month_09 = ifelse(is.na(month_09), 0, month_09),
                            month_10 = ifelse(is.na(month_10), 0, month_10),
                            month_11 = ifelse(is.na(month_11), 0, month_11),
                            month_12 = ifelse(is.na(month_12), 0, month_12)),
                     remove_selected_columns = TRUE)
  
  train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared
  
  predictions <- predict(fit.xgb_da$finalModel, 
                         newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
  
  test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)
  
  model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}

model_stats %>% rbind(tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
                             test_r2 = mean(model_stats$test_r2, na.rm = T)))